# Use an HTTPS CRAN mirror so packages are not downloaded in clear text.
options(repos = c(CRAN = "https://cran.us.r-project.org"))

# Install (only when missing) and attach every package the analysis needs.
required_packages <- c(
  "ggpubr", "tidyverse", "ggplot2", "plotly", "moments", "DT", "LambertW"
)
for (package in required_packages) {
  # requireNamespace() only checks availability; library() below does the
  # attaching and stops with an error if the package still cannot be loaded.
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package)
  }
  library(package, character.only = TRUE)
}

1. Reading the gapminder data into a tibble

# Load the cleaned gapminder CSV (path is relative to the working directory).
gapminder <- readr::read_csv(file = "gapminder_clean.csv")

2. Filtering and plotting the data

# Keep only the 1962 observations.
gapminder_filtered <- filter(gapminder, Year == 1962)

# Scatter plot of CO2 emissions against GDP per capita on log10 scales.
# log10() is used (rather than the natural-log log()) so the plotted values
# match the "Log base 10" axis labels.
gapminder_plot <- ggplot(
  gapminder_filtered,
  aes(
    x = log10(gdpPercap),
    y = log10(`CO2 emissions (metric tons per capita)`)
  )
) +
  geom_point() +
  labs(
    x = "Log base 10 transform of gdpPercap",
    y = "Log base 10 transform of CO2 emissions (metric tons per capita)"
  ) +
  theme(text = element_text(size = 10))

gapminder_plot

3. Finding correlation

# Pearson correlation between CO2 emissions and GDP per capita for the rows
# currently held in gapminder_filtered, plus the p-value of the matching
# correlation test.
correlation <- with(
  gapminder_filtered,
  cor(
    `CO2 emissions (metric tons per capita)`,
    gdpPercap,
    use = "complete.obs"
  )
)
p_value <- with(
  gapminder_filtered,
  cor.test(`CO2 emissions (metric tons per capita)`, gdpPercap)
)$p.value

print(paste("The correlation coefficient is", as.character(correlation)))
## [1] "The correlation coefficient is 0.926081672501945"
print(paste("The p-value is", as.character(p_value)))
## [1] "The p-value is 1.1286792210055e-46"

4. On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

# Drop rows where either variable is missing so every yearly correlation is
# computed on complete pairs.
gapminder_filtered <- gapminder %>%
  filter(
    !is.na(`CO2 emissions (metric tons per capita)`),
    !is.na(gdpPercap)
  )

# Pearson correlation between CO2 emissions and GDP per capita for each year.
cor_by_year <- gapminder_filtered %>%
  group_by(Year) %>%
  summarise(
    correlation = cor(
      `CO2 emissions (metric tons per capita)`,
      gdpPercap,
      use = "complete.obs"
    )
  )

datatable(cor_by_year)

# "Strongest" means largest magnitude, so order by abs(correlation): a strong
# negative correlation must not lose to a weak positive one. slice_max() also
# avoids comparing doubles with `==` and always returns a single row.
strongest_year_row <- cor_by_year %>%
  slice_max(abs(correlation), n = 1, with_ties = FALSE)

strongest_year <- strongest_year_row
datatable(strongest_year)

5. Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

# Restrict the data to the year with the strongest correlation. The year is
# read from `strongest_year` instead of being hard-coded, so the plot stays in
# sync with the correlation step if the data changes.
strongest_year_value <- strongest_year$Year[1]

gapminder_strongest_year <- gapminder %>%
  filter(Year == strongest_year_value)

# Interactive scatter plot: point size encodes population and color encodes
# continent; both axes are natural-log transformed.
strongest_year_plot <- ggplot(
  gapminder_strongest_year,
  aes(
    x = log(gdpPercap),
    y = log(`CO2 emissions (metric tons per capita)`),
    color = continent,
    size = pop
  )
) +
  geom_point()

ggplotly(strongest_year_plot)

1. What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)

We want to test if there is a statistically significant difference in the Energy use per continent. Thus, continent is the predictor variable, and energy use is the outcome variable.

To use a parametric test, we must ensure that three assumptions are met: Normality, equal variances, and independence.

We visually assess normality using histograms of energy use for each continent

# Histograms of energy use, one facet per continent, for a visual normality
# check. scales = "free" gives every facet its own axis ranges.
gapminder %>%
  ggplot(aes(x = `Energy use (kg of oil equivalent per capita)`)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(continent), scales = "free") +
  labs(
    x = "Energy use (kg of oil equivalent per capita)",
    y = "Frequency"
  )

Visually, the data is not normally distributed. Therefore, we use the non-parametric Kruskal-Wallis test

# Kruskal-Wallis rank sum test of energy use across continents.
# No na.action argument is passed: the default (x, g) method ignores it and
# drops incomplete pairs on its own, so the argument was a misleading no-op.
kruskal.test(
  gapminder$`Energy use (kg of oil equivalent per capita)`,
  gapminder$continent
)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  gapminder$`Energy use (kg of oil equivalent per capita)` and gapminder$continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16

The p-value is below 2.2e-16, far smaller than the 0.05 significance level, so energy use differs significantly between at least two continents.

2. Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)

Filtering the data

# Keep observations after 1990 that belong to Asia or Europe.
gapminder_years <- filter(
  gapminder,
  Year > 1990,
  continent %in% c("Asia", "Europe")
)

Visualizations

# Interactive box plot of imports (% of GDP), one box per continent.
box_plot <- gapminder_years %>%
  ggplot(aes(
    x = continent,
    y = `Imports of goods and services (% of GDP)`,
    fill = continent
  )) +
  geom_boxplot() +
  labs(
    title = "Box Plot of GDP Imports by Continent",
    x = "Continent",
    y = "Imports of goods and services (% of GDP)",
    fill = "Continent"
  )

ggplotly(box_plot)
# Interactive, semi-transparent density curves of imports (% of GDP),
# overlaid by continent.
density_plot <- gapminder_years %>%
  ggplot(aes(
    x = `Imports of goods and services (% of GDP)`,
    fill = continent
  )) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Plot of GDP Imports by Continent",
    x = "Imports of goods and services (% of GDP)",
    fill = "Continent"
  )

ggplotly(density_plot)

Visually, the two continents’ imports of goods and services are very similar, with overlapping peaks, although the variances appear to differ. There appear to be 4 outliers in Asia.

Plotting a qqplot

# Alias for gapminder_years; no additional filtering happens on this line.
data <- gapminder_years

# Interactive Q-Q plots of imports (% of GDP), one panel per continent.
ggplotly(
  ggqqplot(
    data = data,
    x = "`Imports of goods and services (% of GDP)`",
    facet.by = "continent"
  )
)

For Asia, a few points with high import values (% of GDP) lie above the diagonal line. As the normality assumption is violated, a parametric test would not be appropriate, so we use the non-parametric Mann-Whitney-Wilcoxon test.

Perform Mann-Whitney-Wilcoxon Test

# Mann-Whitney-Wilcoxon rank sum test comparing imports (% of GDP) between
# the continents present in gapminder_years.
result <- wilcox.test(
  `Imports of goods and services (% of GDP)` ~ continent,
  data = gapminder_years
)
print(result)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Imports of goods and services (% of GDP) by continent
## W = 5707, p-value = 0.7867
## alternative hypothesis: true location shift is not equal to 0

As the p value of 0.7867 is greater than 0.05, we did not find a significant difference in ‘Imports of goods and services (% of GDP)’ between Europe and Asia.

3. What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

# Rank countries by population density within each year, in descending order
# (the densest country in a year gets rank 1).
# Year is selected explicitly and the grouping is dropped afterwards, so dplyr
# no longer has to re-add the grouping column implicitly (which printed an
# "Adding missing grouping variables" message) and no leftover grouping leaks
# into later steps.
# NOTE(review): mean() below has no na.rm, so a country with a missing rank in
# any year presumably ends up with an NA average and sorts last -- confirm
# this is the intended treatment of missing densities.
gapminder_pd <- gapminder %>%
  group_by(Year) %>%
  mutate(
    population_rank = rank(
      -`Population density (people per sq. km of land area)`
    )
  ) %>%
  ungroup() %>%
  dplyr::select(Year, population_rank, `Country Name`)

# Average each country's yearly rank and keep the five lowest (best) averages.
gapminder_rank_mean <- gapminder_pd %>%
  group_by(`Country Name`) %>%
  summarize(mean_population_density_rank = mean(population_rank)) %>%
  arrange(mean_population_density_rank) %>%
  slice_head(n = 5)

# Bar chart of the mean density rank of each country in gapminder_rank_mean,
# followed by the same figures as an interactive table.
gapminder_rank_mean_plot <- gapminder_rank_mean %>%
  ggplot(aes(x = `Country Name`, y = mean_population_density_rank)) +
  geom_bar(stat = "Identity") +
  labs(
    title = "Countries with greatest average ranking for population density",
    y = "Mean rank of country"
  )
gapminder_rank_mean_plot

datatable(gapminder_rank_mean)

As seen from the bar chart, Macao SAR, China and Monaco are tied for having the highest 'Population density (people per sq. km of land area)' across all years; each had an average rank of 1.5.

4. What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007?

# Get the top 5 countries with the greatest increase in life expectancy.
# Rows are ordered by Year (not by life expectancy) before diff(), so the
# result is the signed change 2007 - 1962. Sorting by the value itself made
# diff() non-negative and turned every decrease into an apparent increase.
gapminder_difference <- gapminder %>%
  filter(Year %in% c(1962, 2007)) %>%
  arrange(Year) %>%
  group_by(`Country Name`) %>%
  reframe(
    `Difference in Life expectancy (2007 - 1962)` =
      diff(`Life expectancy at birth, total (years)`)
  ) %>%
  arrange(desc(`Difference in Life expectancy (2007 - 1962)`)) %>%
  slice_head(n = 5)

# Interactive bar chart of the life-expectancy change for each country held
# in gapminder_difference.
gapminder_difference_plot <- ggplot(
  gapminder_difference,
  aes(
    x = `Country Name`,
    y = `Difference in Life expectancy (2007 - 1962)`
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 5 countries with the greatest increase in life expectancy from 1962 to 2007",
    y = "Difference in Life expectancy in years for (2007 - 1962)"
  )

ggplotly(gapminder_difference_plot)

As seen from the above bar chart, the country whose life expectancy increased the most from 1962 to 2007 is the Maldives.